# load packages
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(plotly)
library(reshape2)
library(car)
library(kableExtra)
library(broom)
library(knitr)
library(pROC)
library(ggpattern)
library(tidyr)
library(ResourceSelection)
library(sf)
library(arm)
library(xtable)
library(gridExtra)
# import dataset
female <- read_csv("/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/Data/wm.csv")
# get a glimpse of dataset
head(female)
## # A tibble: 6 × 458
## HH1 HH2 LN WM1 WM2 WM3 WMINT WM4 WM5 WM6D WM6M WM6Y WM8
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 3 1 2 3 92 91 92 19 11 2020 2
## 2 1 4 2 1 4 2 92 91 92 18 11 2020 1
## 3 1 9 4 1 9 4 92 91 92 18 11 2020 1
## 4 1 10 4 1 10 4 92 91 92 18 11 2020 2
## 5 1 11 4 1 11 4 92 91 92 19 11 2020 2
## 6 1 11 5 1 11 5 92 91 92 18 11 2020 2
## # ℹ 445 more variables: WM9 <dbl>, WM17 <dbl>, WM7H <dbl>, WM7M <dbl>,
## # WM10H <dbl>, WM10M <dbl>, WM11 <dbl>, WM12 <dbl>, WM13 <dbl>, WM14 <dbl>,
## # WM15 <dbl>, WM22 <dbl>, WM23 <dbl>, WM24 <dbl>, WMHINT <dbl>, WMFIN <dbl>,
## # WB3M <dbl>, WB3Y <dbl>, WB4 <dbl>, WB5 <dbl>, WB6A <dbl>, WB6B <dbl>,
## # WB7 <dbl>, WB9 <lgl>, WB10A <lgl>, WB10B <lgl>, WB11 <lgl>, WB12A <lgl>,
## # WB12B <lgl>, WB14 <dbl>, WB15 <dbl>, WB16 <dbl>, WB17 <dbl>, WB18 <dbl>,
## # WB19A <chr>, WB19B <chr>, WB19C <chr>, WB19D <chr>, WB19E <chr>, …
# Select the specified columns to create a new dataframe
female_df <- dplyr::select(female, WAGEM, MSTATUS, HH6, HH7, welevel, insurance, ethnicity, windex5, CP2, HA1, MT4, MT9, MT11)
# View the first few rows of the new dataframe
summary(female_df)
## WAGEM MSTATUS HH6 HH7 welevel
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.00
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.00
## Median :20.00 Median :1.000 Median :2.000 Median :3.000 Median :2.00
## Mean :20.92 Mean :1.413 Mean :1.683 Mean :3.404 Mean :2.46
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000 3rd Qu.:3.00
## Max. :47.00 Max. :9.000 Max. :2.000 Max. :6.000 Max. :9.00
## NA's :2026 NA's :11 NA's :11 NA's :11 NA's :524
## insurance ethnicity windex5 CP2
## Min. :1.000 Min. :1.000 Min. :0.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :1.000 Median :1.000 Median :2.000 Median :1.000
## Mean :1.135 Mean :2.035 Mean :2.494 Mean :1.431
## 3rd Qu.:1.000 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :9.000 Max. :6.000 Max. :5.000 Max. :9.000
## NA's :524 NA's :864
## HA1 MT4 MT9 MT11
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.00
## Median :1.000 Median :2.000 Median :1.000 Median :1.00
## Mean :1.215 Mean :1.664 Mean :1.365 Mean :1.12
## 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.00
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.00
## NA's :524 NA's :524 NA's :2496 NA's :524
# Rename the columns
female_df <- female_df %>%
rename(
age_first_marriage = WAGEM,
marital_status = MSTATUS,
area = HH6,
region = HH7,
education_level = welevel,
health_insurance = insurance,
ethnicity = ethnicity,
wealth_index = windex5,
current_contraceptive_use = CP2,
awareness_hiv_aids = HA1,
used_computer_tablet = MT4,
used_internet = MT9,
owns_mobile_phone = MT11
)
# Summarize missing values by column
summarize_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(summarize_missing)
## age_first_marriage marital_status area
## 2026 11 11
## region education_level health_insurance
## 11 524 524
## ethnicity wealth_index current_contraceptive_use
## 0 0 864
## awareness_hiv_aids used_computer_tablet used_internet
## 524 524 2496
## owns_mobile_phone
## 524
# Recode the value 9 to NA for specified variables
female_df <- female_df %>%
mutate(
current_contraceptive_use = na_if(current_contraceptive_use, 9),
used_internet = na_if(used_internet, 9),
health_insurance = na_if(health_insurance, 9),
education_level = na_if(education_level, 9),
awareness_hiv_aids = na_if(awareness_hiv_aids, 9),
used_computer_tablet = na_if(used_computer_tablet, 9),
owns_mobile_phone = na_if(owns_mobile_phone, 9),
marital_status = na_if(marital_status, 9)
)
# Recode the value 6 to NA for 'ethnicity'
female_df$ethnicity <- na_if(female_df$ethnicity, 6)
# Recode the value 0 to NA for 'wealth_index'
female_df$wealth_index <- na_if(female_df$wealth_index, 0)
# Recoding variables from 1 (Yes) and 2 (No) to 1 (Yes) and 0 (No)
female_df$health_insurance <- ifelse(female_df$health_insurance == 2, 0, female_df$health_insurance)
female_df$current_contraceptive_use <- ifelse(female_df$current_contraceptive_use == 2, 0, female_df$current_contraceptive_use)
female_df$awareness_hiv_aids <- ifelse(female_df$awareness_hiv_aids == 2, 0, female_df$awareness_hiv_aids)
female_df$used_computer_tablet <- ifelse(female_df$used_computer_tablet == 2, 0, female_df$used_computer_tablet)
female_df$owns_mobile_phone <- ifelse(female_df$owns_mobile_phone == 2, 0, female_df$owns_mobile_phone)
female_df$used_internet <- ifelse(female_df$used_internet == 2, 0, female_df$used_internet)
# View the changes to ensure NA substitution has been correctly applied
summary(female_df)
## age_first_marriage marital_status area region
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :20.00 Median :1.000 Median :2.000 Median :3.000
## Mean :20.92 Mean :1.408 Mean :1.683 Mean :3.404
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :47.00 Max. :3.000 Max. :2.000 Max. :6.000
## NA's :2026 NA's :18 NA's :11 NA's :11
## education_level health_insurance ethnicity wealth_index
## Min. :0.00 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:1.00 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.00 Median :1.0000 Median :1.000 Median :2.000
## Mean :2.46 Mean :0.8659 Mean :1.586 Mean :2.615
## 3rd Qu.:3.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.00 Max. :1.0000 Max. :4.000 Max. :5.000
## NA's :525 NA's :525 NA's :1148 NA's :524
## current_contraceptive_use awareness_hiv_aids used_computer_tablet
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000
## Median :1.0000 Median :1.0000 Median :0.0000
## Mean :0.5842 Mean :0.7975 Mean :0.3412
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :885 NA's :541 NA's :532
## used_internet owns_mobile_phone
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000
## Mean :0.6394 Mean :0.8831
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## NA's :2501 NA's :529
Access to Media# Combine individual variables for access to the internet, phone, and computer into a single variable
# This new variable "access_to_media" will have a value of 1 if the respondent has access to any of these media sources, and 0 if not
# This provides a more comprehensive measure of media access
female_df <- female_df %>%
mutate(access_to_media = ifelse(used_computer_tablet == 1 | used_internet == 1 | owns_mobile_phone == 1, 1, 0))
# In later analysis, use access_to_media instead of the 3 separate variables
female_df <- female_df %>%
dplyr::select(-used_computer_tablet, -used_internet, -owns_mobile_phone)
# Removing rows where 'age_first_marriage' is NA in female_df
# This is because these values represent people who are not yet married, which is not the focus of the study
married_df <- female_df[!is.na(female_df$age_first_marriage), ]
summary(married_df)
## age_first_marriage marital_status area region
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :20.00 Median :1.000 Median :2.000 Median :3.000
## Mean :20.92 Mean :1.063 Mean :1.708 Mean :3.377
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :47.00 Max. :2.000 Max. :2.000 Max. :6.000
##
## education_level health_insurance ethnicity wealth_index
## Min. :0.00 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:1.00 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.00 Median :1.0000 Median :1.000 Median :2.000
## Mean :2.29 Mean :0.8597 Mean :1.641 Mean :2.541
## 3rd Qu.:3.00 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.00 Max. :1.0000 Max. :4.000 Max. :5.000
## NA's :405 NA's :404 NA's :980 NA's :404
## current_contraceptive_use awareness_hiv_aids access_to_media
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.7099 Mean :0.7756 Mean :0.8918
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## NA's :744 NA's :415 NA's :408
# Exporting female_df to a CSV file in the current working directory
#write.csv(married_df, "married_df.csv", row.names = FALSE)
# Histogram for 'Age at First Marriage'
afm_hist <- ggplot(married_df, aes(x = age_first_marriage)) +
geom_histogram(fill = "#F7C0C8", color = "black") +
theme_minimal() +
ggtitle("Histogram of Age at First Marriage") +
xlab("Age at First Marriage") +
ylab("Frequency")
print(afm_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggsave("afm_hist.png", plot = afm_hist, width = 8, height = 6)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Before that, let's double check the count of missing values by column
count_missing <- sapply(married_df, function(x) sum(is.na(x)))
print(count_missing)
## age_first_marriage marital_status area
## 0 0 0
## region education_level health_insurance
## 0 405 404
## ethnicity wealth_index current_contraceptive_use
## 980 404 744
## awareness_hiv_aids access_to_media
## 415 408
# Make a copy of female_df for imputation
imputed_df <- married_df
# Define a function to calculate mode for categorical variables
getMode <- function(v) {
# The mode is the value that appears most frequently in the data
uniqv <- unique(na.omit(v)) # Omit NA values and get unique values
uniqv[which.max(tabulate(match(v, uniqv)))] # Return the value with the highest frequency
}
# For 'ethnicity', an ordinal variable with predefined categories, it makes sense to impute missing values with the mode.
mode_ethnicity <- getMode(imputed_df$ethnicity)
imputed_df <- mutate(imputed_df, ethnicity = ifelse(is.na(ethnicity), mode_ethnicity, ethnicity))
mode_area <- getMode(imputed_df$area)
imputed_df <- mutate(imputed_df, area = ifelse(is.na(area), mode_area, area))
mode_region <- getMode(imputed_df$region)
imputed_df <- mutate(imputed_df, region = ifelse(is.na(region), mode_region, region))
mode_marital_status <- getMode(imputed_df$marital_status)
imputed_df <- mutate(imputed_df, marital_status = ifelse(is.na(marital_status), mode_marital_status, marital_status))
# Binary variables like 'current_contraceptive_use', 'health_insurance','awareness_hiv_aids', 'used_internet', 'used_computer_tablet', 'owns_mobile_phone', and 'access_to_media'
# should be imputed with the mode since it represents the most frequent category (either 0 or 1).
# Calculate the mode for each binary variable
mode_current_contraceptive_use <- getMode(imputed_df$current_contraceptive_use)
mode_health_insurance <- getMode(imputed_df$health_insurance)
mode_awareness_hiv_aids <- getMode(imputed_df$awareness_hiv_aids)
mode_access_to_media <- getMode(imputed_df$access_to_media)
# Impute missing values for binary variables
imputed_df <- mutate(imputed_df,
current_contraceptive_use = ifelse(is.na(current_contraceptive_use), mode_current_contraceptive_use, current_contraceptive_use),
health_insurance = ifelse(is.na(health_insurance), mode_health_insurance, health_insurance),
awareness_hiv_aids = ifelse(is.na(awareness_hiv_aids), mode_awareness_hiv_aids, awareness_hiv_aids),
access_to_media = ifelse(is.na(access_to_media), mode_access_to_media, access_to_media)
)
# 'education_level' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Primary", "Secondary"), using the mode may still be appropriate.
mode_education_level <- getMode(imputed_df$education_level)
imputed_df <- mutate(imputed_df, education_level = ifelse(is.na(education_level), mode_education_level, education_level))
# 'wealth_index' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Poorest", "Poor",...), using the mode may still be appropriate.
mode_wealth_index <- getMode(imputed_df$wealth_index)
imputed_df <- mutate(imputed_df, wealth_index = ifelse(is.na(wealth_index), mode_wealth_index, wealth_index))
# Check the resulting dataset to confirm changes
summary(imputed_df)
## age_first_marriage marital_status area region
## Min. :10.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:18.00 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:2.000
## Median :20.00 Median :1.000 Median :2.000 Median :3.000
## Mean :20.92 Mean :1.063 Mean :1.708 Mean :3.377
## 3rd Qu.:23.00 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:5.000
## Max. :47.00 Max. :2.000 Max. :2.000 Max. :6.000
## education_level health_insurance ethnicity wealth_index
## Min. :0.000 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1.000
## Median :2.000 Median :1.0000 Median :1.000 Median :2.000
## Mean :2.278 Mean :0.8658 Mean :1.574 Mean :2.474
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:4.000
## Max. :5.000 Max. :1.0000 Max. :4.000 Max. :5.000
## current_contraceptive_use awareness_hiv_aids access_to_media
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.0000
## Median :1.0000 Median :1.0000 Median :1.0000
## Mean :0.7332 Mean :0.7856 Mean :0.8965
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# After imputation, let's check for missing values
count_imputation <- sapply(imputed_df, function(x) sum(is.na(x)))
print(count_imputation)
## age_first_marriage marital_status area
## 0 0 0
## region education_level health_insurance
## 0 0 0
## ethnicity wealth_index current_contraceptive_use
## 0 0 0
## awareness_hiv_aids access_to_media
## 0 0
# Histogram for 'ethnicity' before imputation
ethnic <- ggplot(married_df, aes(x = ethnicity)) +
geom_histogram(fill = "#F7C0C8", color = "black", bins = 30) +
theme_light() +
ggtitle("Before Imputation") +
xlab("Ethnicity") +
ylab("Frequency")
# Histogram for 'ethnicity' after imputation
ethnic_imputed <- ggplot(imputed_df, aes(x = ethnicity)) +
geom_histogram(fill = "#E83853", color = "black", bins = 30) +
theme_light() +
ggtitle("After Imputation") +
xlab("Ethnicity") +
ylab("Frequency")
# Arrange the two plots side by side
grid.arrange(ethnic, ethnic_imputed, ncol = 2)
## Warning: Removed 980 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Histogram for 'education_level' before imputation
educ <- ggplot(married_df, aes(x = education_level)) +
geom_histogram(fill = "#F7C0C8", color = "black", bins = 30) +
theme_light() +
ggtitle("Before Imputation") +
xlab("Education Level") +
ylab("Frequency")
# Histogram for 'education_level' after imputation
educ_imputed <- ggplot(imputed_df, aes(x = education_level)) +
geom_histogram(fill = "#E83853", color = "black", bins = 30) +
theme_light() +
ggtitle("After Imputation") +
xlab("Education Level") +
ylab("Frequency")
# Arrange the two plots side by side
grid.arrange(educ, educ_imputed, ncol = 2)
## Warning: Removed 405 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Arrange the two plots side by side and capture the layout as a grob
#combined_plots <- arrangeGrob(afm_0, afm_imputed, ncol = 2)
# Now, use ggsave to save the combined plot
#ggsave("combined_age_first_marriage.png", plot = combined_plots, width = 10, height = 5)
# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
imputed_df <- imputed_df %>%
mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
imputed_df <- imputed_df %>%
dplyr::select(child_marriage, child_marriage_u16, everything())
# Exporting female_df to a CSV file in the current working directory
#write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)
# Aggregate the data by region to get the total number of child marriages under 18 per region
region_counts <- aggregate(child_marriage ~ region, data = imputed_df, FUN = sum)
# Calculate the total number of child marriages under 18 in the dataset
total_child_marriages <- sum(region_counts$child_marriage)
# Calculate the percentage for each region
region_counts$married_u18_perc_of_total <- (region_counts$child_marriage / total_child_marriages) * 100
# Mapping region numbers to names
region_names <- c("Red River Delta", "Northern Midlands And Mountain",
"North Central And Central Coastal", "Central Highlands",
"South East", "Mekong River Delta")
names(region_counts)[1] <- "region_name"
region_counts$region_name <- factor(region_counts$region_name, levels = 1:6, labels = region_names)
# Display the final data frame
print(region_counts)
## region_name child_marriage married_u18_perc_of_total
## 1 Red River Delta 142 7.226463
## 2 Northern Midlands And Mountain 888 45.190840
## 3 North Central And Central Coastal 215 10.941476
## 4 Central Highlands 271 13.791349
## 5 South East 194 9.872774
## 6 Mekong River Delta 255 12.977099
# Updated mapping including all provinces and cities in the Red River Delta
province_to_region <- data.frame(
NAME_1 = c(
'Bắc Ninh', 'Hà Nam', 'Hà Nội', 'Hải Dương', 'Hải Phòng', 'Hoà Bình', 'Hưng Yên', 'Nam Định', 'Ninh Bình', 'Thái Bình', 'Vĩnh Phúc', # Red River Delta 11
'Bắc Giang', 'Bắc Kạn', 'Cao Bằng', 'Hà Giang', 'Lạng Sơn', 'Lào Cai', 'Phú Thọ', 'Quảng Ninh', 'Thái Nguyên', 'Tuyên Quang', 'Yên Bái', 'Điện Biên', 'Lai Châu', 'Sơn La', # Northern Midlands And Mountain 14
'Bình Định', 'Bình Thuận', 'Khánh Hòa', 'Ninh Thuận', 'Phú Yên', 'Quảng Nam', 'Quảng Ngãi', 'Thừa Thiên Huế', 'Đà Nẵng', 'Hà Tĩnh', 'Nghệ An', 'Quảng Bình', 'Quảng Trị', 'Thanh Hóa', # North Central And Central Coastal 14
'Đắk Lắk', 'Đắk Nông', 'Gia Lai', 'Kon Tum', 'Lâm Đồng', # Central Highlands 5
'Bà Rịa - Vũng Tàu', 'Bình Dương', 'Bình Phước', 'Đồng Nai', 'Hồ Chí Minh', 'Tây Ninh', # South East 6
'An Giang', 'Bạc Liêu', 'Bến Tre', 'Cà Mau', 'Cần Thơ', 'Đồng Tháp', 'Hậu Giang', 'Kiên Giang', 'Long An', 'Sóc Trăng', 'Tiền Giang', 'Trà Vinh', 'Vĩnh Long' # Mekong River Delta 13
),
Region = c(
rep('Red River Delta', 11),
rep('Northern Midlands And Mountain', 14),
rep('North Central And Central Coastal', 14),
rep('Central Highlands', 5),
rep('South East', 6),
rep('Mekong River Delta', 13)
)
)
# Check the mapping
#print(province_to_region)
# Read shapefile
vietnam_shape <- st_read('/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp')
## Multiple layers are present in data source /Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp, reading layer `gadm41_VNM_2'.
## Use `st_layers' to list all layer names and their type in a data source.
## Set the `layer' argument in `st_read' to read a particular layer.
## Reading layer `gadm41_VNM_2' from data source
## `/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 710 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 102.1446 ymin: 8.381355 xmax: 109.4692 ymax: 23.39269
## Geodetic CRS: WGS 84
# Join the shapefile with the province-to-region mapping
vietnam_shape_with_region <- vietnam_shape %>%
left_join(province_to_region, by = "NAME_1")
# Aggregate the shapefile data by region
vietnam_regions <- vietnam_shape_with_region %>%
group_by(Region) %>%
summarise(geometry = st_union(geometry), .groups = 'drop')
# Join the aggregated shapefile data with the child marriage data
vietnam_map_data <- vietnam_regions %>%
left_join(region_counts, by = c("Region" = "region_name"))
# Define colors with your specific choices
colors_ordered <- setNames(c("#B20033", "#CD0A25", "#E83853", "#EF7D8D", "#F7C0C8", "#FBE1E5"),
c("Northern Midlands And Mountain", "Central Highlands",
"Mekong River Delta", "North Central And Central Coastal",
"South East", "Red River Delta"))
# Plot
mapvn <- ggplot(data = vietnam_map_data) +
geom_sf(aes(fill = factor(Region, levels = names(colors_ordered))), color = NA) +
geom_sf_text(aes(label = sprintf("%.1f%%", married_u18_perc_of_total)), size = 4, hjust = 0.5, vjust = 0.5) +
scale_fill_manual(values = colors_ordered, name = "Region") +
labs(title = "Regional Contribution of Female Child Marriage Rates to Total Rates in Vietnam") +
theme_void() +
theme(legend.position = "right")
print(mapvn)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data
#ggsave("mapvn.png", plot = mapvn, width = 8, height = 6, dpi = 300)
total_child_marriages_u18 <- sum(imputed_df$child_marriage == 1)
# Aggregate counts for each category by region
counts_df <- aggregate(cbind(married_u18 = imputed_df$child_marriage == 1,
married_u16 = imputed_df$child_marriage_u16 == 1) ~ region,
data = imputed_df,
FUN = sum)
# Convert counts to percentages based on total child marriages under 18
counts_df$married_u18 <- (counts_df$married_u18 / total_child_marriages_u18) * 100
counts_df$married_u16 <- (counts_df$married_u16 / total_child_marriages_u18) * 100
p <- ggplot(counts_df, aes(x = factor(region))) +
geom_bar(aes(y = married_u18, fill = "Married before 18 years old"), stat = "identity", width = 0.5) +
geom_bar(aes(y = married_u16, fill = "Married before 16 years old"), stat = "identity", width = 0.5) +
# Adding text labels for married_u18
geom_text(aes(y = married_u18, label = sprintf("%.1f%%", married_u18)),
position = position_stack(vjust = 1.03),
size = 4, color = "black") +
# Adding text labels for married_u16
geom_text(aes(y = married_u16, label = sprintf("%.1f%%", married_u16)),
position = position_stack(vjust = 1.05),
size = 4, color = "black") +
scale_fill_manual(values = c("Married before 18 years old" = "#EF7D8D",
"Married before 16 years old" = "#B20016"),
name = "Marital Status") +
labs(x = "Region", y = "Percentage", title = "Marital Status Among Female Respondents by Region", element_text(size = 14)) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5, size = 18),
plot.margin = margin(t = 10, r = 10, b = 10, l = 10, unit = "mm"),
axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
legend.position = c(0.85,0.8),
legend.text = element_text(size = 12),
legend.title = element_text(size = 14)) +
scale_x_discrete(labels = c("1" = "Red River Delta", "2" = "Northern Midlands And Mountain",
"3" = "North Central And Central Coastal", "4" = "Central Highlands",
"5" = "South East", "6" = "Mekong River Delta"))
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p)
#ggsave("marital_status.png", plot = p, width = 8, height = 7)
# Calculate correlation matrix
cor_matrix <- cor(imputed_df %>% select_if(is.numeric), use = "complete.obs")
# Melt the correlation matrix
melted_cor_matrix <- melt(cor_matrix)
# Generate an interactive heatmap
corr_matrix <- ggplot(melted_cor_matrix, aes(Var1, Var2, fill = value)) +
geom_tile() +
scale_fill_gradientn(
colours = c("deepskyblue", "white", "#CD0A25"),
values = scales::rescale(c(-1, 0, 1)),
limits = c(-1, 1),
name="Pearson\nCorrelation"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("") +
ylab("") +
ggtitle("Correlation Matrix")
# Convert ggplot object to plotly for interactivity
ggplotly(corr_matrix)
#write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)
# Convert nominal and ordinal variables to factors
imputed_df$area <- as.factor(imputed_df$area)
imputed_df$region <- as.factor(imputed_df$region)
imputed_df$education_level <- factor(imputed_df$education_level, ordered = FALSE)
imputed_df$ethnicity <- as.factor(imputed_df$ethnicity)
imputed_df$wealth_index <- factor(imputed_df$wealth_index, ordered = FALSE)
# Binary variables are already in the correct format and can be used as is
# Baseline model for reference
baseline_model <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = imputed_df)
# Summarize the baseline model
summary(baseline_model)
##
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index +
## health_insurance + current_contraceptive_use + awareness_hiv_aids +
## access_to_media, family = binomial(), data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.35056 0.13405 -2.615 0.00892 **
## area2 0.40764 0.08124 5.018 5.22e-07 ***
## education_level1 -0.36537 0.08856 -4.126 3.70e-05 ***
## education_level2 -0.43403 0.08711 -4.983 6.27e-07 ***
## education_level3 -1.11556 0.11658 -9.569 < 2e-16 ***
## education_level4 -3.14435 0.51246 -6.136 8.47e-10 ***
## education_level5 -2.92930 0.25430 -11.519 < 2e-16 ***
## wealth_index2 -0.54988 0.08114 -6.777 1.23e-11 ***
## wealth_index3 -0.93401 0.10076 -9.270 < 2e-16 ***
## wealth_index4 -0.76067 0.11045 -6.887 5.70e-12 ***
## wealth_index5 -1.02057 0.15040 -6.786 1.16e-11 ***
## health_insurance 0.11657 0.08174 1.426 0.15384
## current_contraceptive_use -0.03278 0.06298 -0.520 0.60280
## awareness_hiv_aids -0.45273 0.07058 -6.415 1.41e-10 ***
## access_to_media -0.02742 0.08280 -0.331 0.74051
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9576.1 on 9267 degrees of freedom
## Residual deviance: 8000.9 on 9253 degrees of freedom
## AIC: 8030.9
##
## Number of Fisher Scoring iterations: 7
# Enhanced model with additional fixed effects
enhanced_model <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media,
family = binomial(),
data = imputed_df)
# Summarize the new model with FEs
summary(enhanced_model)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media, family = binomial(),
## data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.193525 0.182449 -6.542 6.08e-11 ***
## area2 0.268252 0.085814 3.126 0.00177 **
## region2 0.364102 0.130710 2.786 0.00534 **
## region3 0.162343 0.129158 1.257 0.20878
## region4 0.331093 0.127873 2.589 0.00962 **
## region5 -0.009249 0.127505 -0.073 0.94217
## region6 -0.041750 0.134893 -0.310 0.75694
## education_level1 0.023045 0.096277 0.239 0.81083
## education_level2 -0.031527 0.095643 -0.330 0.74167
## education_level3 -0.739558 0.124044 -5.962 2.49e-09 ***
## education_level4 -2.815602 0.514641 -5.471 4.47e-08 ***
## education_level5 -2.597688 0.257538 -10.087 < 2e-16 ***
## ethnicity2 -0.004214 0.106644 -0.040 0.96848
## ethnicity3 0.284476 0.130154 2.186 0.02884 *
## ethnicity4 1.200228 0.109158 10.995 < 2e-16 ***
## wealth_index2 -0.230751 0.086382 -2.671 0.00756 **
## wealth_index3 -0.612377 0.106964 -5.725 1.03e-08 ***
## wealth_index4 -0.432981 0.118543 -3.653 0.00026 ***
## wealth_index5 -0.685336 0.162011 -4.230 2.33e-05 ***
## health_insurance 0.002808 0.083641 0.034 0.97321
## current_contraceptive_use 0.031968 0.065069 0.491 0.62322
## awareness_hiv_aids -0.184659 0.077076 -2.396 0.01658 *
## access_to_media -0.081214 0.087354 -0.930 0.35252
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9576.1 on 9267 degrees of freedom
## Residual deviance: 7749.7 on 9245 degrees of freedom
## AIC: 7795.7
##
## Number of Fisher Scoring iterations: 7
# Adding the interaction term between area and education level
model_interaction <- update(enhanced_model, . ~ . + area:education_level)
summary(model_interaction)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:education_level,
## family = binomial(), data = imputed_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.805452 0.304616 -2.644 0.008189 **
## area2 -0.148776 0.275355 -0.540 0.588986
## region2 0.353114 0.131039 2.695 0.007045 **
## region3 0.155871 0.129375 1.205 0.228281
## region4 0.331470 0.128087 2.588 0.009658 **
## region5 0.003210 0.127831 0.025 0.979965
## region6 -0.042289 0.134983 -0.313 0.754058
## education_level1 -0.154242 0.303997 -0.507 0.611889
## education_level2 -0.518954 0.283108 -1.833 0.066793 .
## education_level3 -1.183848 0.318697 -3.715 0.000203 ***
## education_level4 -3.614007 1.039464 -3.477 0.000507 ***
## education_level5 -2.978633 0.436539 -6.823 8.90e-12 ***
## ethnicity2 -0.003071 0.106724 -0.029 0.977042
## ethnicity3 0.268275 0.130228 2.060 0.039394 *
## ethnicity4 1.222054 0.110105 11.099 < 2e-16 ***
## wealth_index2 -0.243916 0.086709 -2.813 0.004907 **
## wealth_index3 -0.621079 0.107275 -5.790 7.06e-09 ***
## wealth_index4 -0.432211 0.118897 -3.635 0.000278 ***
## wealth_index5 -0.679620 0.163475 -4.157 3.22e-05 ***
## health_insurance 0.009255 0.083784 0.110 0.912046
## current_contraceptive_use 0.035227 0.065108 0.541 0.588473
## awareness_hiv_aids -0.192308 0.077046 -2.496 0.012560 *
## access_to_media -0.085099 0.087235 -0.976 0.329304
## area2:education_level1 0.176156 0.318702 0.553 0.580448
## area2:education_level2 0.547315 0.295447 1.852 0.063954 .
## area2:education_level3 0.499730 0.338962 1.474 0.140402
## area2:education_level4 1.023762 1.194225 0.857 0.391301
## area2:education_level5 0.409921 0.541866 0.756 0.449350
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9576.1 on 9267 degrees of freedom
## Residual deviance: 7743.7 on 9240 degrees of freedom
## AIC: 7799.7
##
## Number of Fisher Scoring iterations: 7
# Function to add significance asterisks
add_asterisks <- function(p_value) {
if (is.na(p_value)) {
return(NA)
} else if (p_value < 0.001) {
return("***")
} else if (p_value < 0.01) {
return("**")
} else if (p_value < 0.05) {
return("*")
} else {
return("")
}
}
# Function to format confidence intervals as a string
format_ci <- function(lower, upper) {
paste0("(", round(lower, 2), ", ", round(upper, 2), ")")
}
# Tidy the baseline model with confidence intervals
tidy_baseline <- tidy(baseline_model, conf.int = TRUE, exponentiate = TRUE)
# Tidy the enhanced model with confidence intervals
tidy_enhanced <- tidy(enhanced_model, conf.int = TRUE, exponentiate = TRUE)
# Tidy the interaction model with confidence intervals
tidy_interaction <- tidy(model_interaction, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_baseline$asterisks <- sapply(tidy_baseline$p.value, add_asterisks)
tidy_enhanced$asterisks <- sapply(tidy_enhanced$p.value, add_asterisks)
tidy_interaction$asterisks <- sapply(tidy_interaction$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_baseline <- tidy_baseline %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_enhanced <- tidy_enhanced %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_interaction <- tidy_interaction %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
# Add a 'Model' column to each tidied dataframe
tidy_baseline <- tidy_baseline %>% mutate(Model = "Model_1")
tidy_enhanced <- tidy_enhanced %>% mutate(Model = "Model_2")
tidy_interaction <- tidy_interaction %>% mutate(Model = "Model_3")
# Combine and pivot the dataframes
combined_results <- bind_rows(
tidy_baseline %>% dplyr::select(term, OR, CI, Model),
tidy_enhanced %>% dplyr::select(term, OR, CI, Model),
tidy_interaction %>% dplyr::select(term, OR, CI, Model)
) %>%
pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_results[is.na(combined_results)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_results <- combined_results %>%
dplyr::select(term,
OR_Model_1, CI_Model_1,
OR_Model_2, CI_Model_2,
OR_Model_3, CI_Model_3)
# Print the final combined table
print(combined_results)
## # A tibble: 28 × 7
## term OR_Model_1 CI_Model_1 OR_Model_2 CI_Model_2 OR_Model_3 CI_Model_3
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 (Intercept) 0.7** (0.54, 0.… 0.3*** (0.21, 0.… 0.45** (0.24, 0.…
## 2 area2 1.5*** (1.28, 1.… 1.31** (1.11, 1.… 0.86 (0.51, 1.…
## 3 education_… 0.69*** (0.58, 0.… 1.02 (0.85, 1.… 0.86 (0.48, 1.…
## 4 education_… 0.65*** (0.55, 0.… 0.97 (0.8, 1.1… 0.6 (0.35, 1.…
## 5 education_… 0.33*** (0.26, 0.… 0.48*** (0.37, 0.… 0.31*** (0.16, 0.…
## 6 education_… 0.04*** (0.01, 0.… 0.06*** (0.02, 0.… 0.03*** (0, 0.14)
## 7 education_… 0.05*** (0.03, 0.… 0.07*** (0.04, 0.… 0.05*** (0.02, 0.…
## 8 wealth_ind… 0.58*** (0.49, 0.… 0.79** (0.67, 0.… 0.78** (0.66, 0.…
## 9 wealth_ind… 0.39*** (0.32, 0.… 0.54*** (0.44, 0.… 0.54*** (0.43, 0.…
## 10 wealth_ind… 0.47*** (0.38, 0.… 0.65*** (0.51, 0.… 0.65*** (0.51, 0.…
## # ℹ 18 more rows
# VIFs check (A VIF value > 5 indicates high multicollinearity)
# Base model
model_1_vif <- vif(baseline_model)
print(model_1_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 1.116394 1 1.056596
## education_level 1.673060 5 1.052813
## wealth_index 1.418920 4 1.044708
## health_insurance 1.018544 1 1.009229
## current_contraceptive_use 1.015178 1 1.007560
## awareness_hiv_aids 1.482419 1 1.217546
## access_to_media 1.263256 1 1.123947
# Enhanced model
model_2_vif <- vif(enhanced_model)
print(model_2_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 1.250833 1 1.118406
## region 4.773168 5 1.169178
## education_level 1.957540 5 1.069476
## ethnicity 4.140370 3 1.267185
## wealth_index 1.953237 4 1.087287
## health_insurance 1.049221 1 1.024315
## current_contraceptive_use 1.022997 1 1.011433
## awareness_hiv_aids 1.649486 1 1.284323
## access_to_media 1.310264 1 1.144668
# Interacton model
model_3_vif <- vif(model_interaction)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(model_3_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 12.769190 1 3.573400
## region 4.828080 5 1.170516
## education_level 4107.374849 5 2.298034
## ethnicity 4.236501 3 1.272041
## wealth_index 2.014490 4 1.091492
## health_insurance 1.052067 1 1.025703
## current_contraceptive_use 1.023631 1 1.011747
## awareness_hiv_aids 1.650085 1 1.284556
## access_to_media 1.309351 1 1.144269
## area:education_level 5655.260451 5 2.372713
region and ethnicity,
with a mild increase in multicollinearity, but not at alarming
levels.area variable and its interaction with
education_level; their VIF values are significantly higher
(around 3.57, 2.3, and 2.37, respectively). Although these values are
below 5, the increase is substantial compared to the previous models and
could start to affect the reliability and interpretability of the
regression coefficients for these variables.# Model 1 vs. Model 2
anova(baseline_model, enhanced_model, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance +
## current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 9253 8000.9
## 2 9245 7749.7 8 251.12 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Model 2 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).
# Model 1 vs. Model 3
anova(baseline_model, model_interaction, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance +
## current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:education_level
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 9253 8000.9
## 2 9240 7743.7 13 257.14 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Model 3 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).
# Model 2 vs. Model 3
anova(enhanced_model, model_interaction, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:education_level
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 9245 7749.7
## 2 9240 7743.7 5 6.0212 0.3042
The p-value is not significant.
# Create a data frame to hold AIC and BIC values
aic_bic_comparison <- data.frame(
Model = c("Model 1", "Model 2", "Model 3"),
AIC = c(AIC(baseline_model), AIC(enhanced_model), AIC(model_interaction)),
BIC = c(BIC(baseline_model), BIC(enhanced_model), BIC(model_interaction))
)
# Print the table
print(aic_bic_comparison)
## Model AIC BIC
## 1 Model 1 8030.853 8137.868
## 2 Model 2 7795.737 7959.826
## 3 Model 3 7799.716 7999.477
Model 2 has the lowest AIC and BIC values of all, indicating that it might be the best model among the three in terms of balancing fit and complexity.
# 1. Hosmer-Lemeshow Test for the Model 1
hoslem.test(baseline_model$y, fitted(baseline_model), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: baseline_model$y, fitted(baseline_model)
## X-squared = 12.943, df = 8, p-value = 0.1138
# 2. Hosmer-Lemeshow Test for the Model 2 (Baseline + Fixed Effects)
hoslem.test(enhanced_model$y, fitted(enhanced_model), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: enhanced_model$y, fitted(enhanced_model)
## X-squared = 5.3687, df = 8, p-value = 0.7175
# 3. Hosmer-Lemeshow Test for the Model 3 (Baseline + Fixed Effects + Interaction Terms)
hoslem.test(model_interaction$y, fitted(model_interaction), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: model_interaction$y, fitted(model_interaction)
## X-squared = 6.1988, df = 8, p-value = 0.625
A large p-value (>0.05) indicates a good fit, meaning that there’s no significant difference between the observed and predicted values. Through each model, the p-value increases which suggests that our decision to include fixed effects and interaction terms are significant.
binnedplot(fitted(model_interaction),
residuals(model_interaction, type = "response"),
nclass = NULL,
xlab = "Expected Values",
ylab = "Average Residuals",
main = "Binned Residual Plot",
cex.pts = 1,
col.int = "gray")
invisible(plot(roc(imputed_df$child_marriage,
fitted(baseline_model)),
col = "red",
main = "ROC Curve: \nModel I (Red) vs. Model II (Green) vs. Model III (Blue)"))
invisible(plot(roc(imputed_df$child_marriage,
fitted(enhanced_model)),
col = "green",
add = T))
invisible(plot(roc(imputed_df$child_marriage,
fitted(model_interaction)),
print.auc = T,
col = "blue",
add = T))
# For each model
roc_response_model_1 <- roc(imputed_df$child_marriage, fitted(baseline_model))
auc_model_1 <- auc(roc_response_model_1)
roc_response_model_2 <- roc(imputed_df$child_marriage, fitted(enhanced_model))
auc_model_2 <- auc(roc_response_model_2)
roc_response_model_3 <- roc(imputed_df$child_marriage, fitted(model_interaction))
auc_model_3 <- auc(roc_response_model_3)
# Compare AUC values
print(paste("AUC Model 1:", auc_model_1))
## [1] "AUC Model 1: 0.772320518006648"
print(paste("AUC Model 2:", auc_model_2))
## [1] "AUC Model 2: 0.786801269233356"
print(paste("AUC Model 3:", auc_model_3))
## [1] "AUC Model 3: 0.787500065329211"
An AUC of 0.5 represents a model with no discriminative ability (akin to random guessing), while an AUC of 1.0 represents perfect discrimination. So, my model is performing substantially better than random guessing.
The AUC of 0.7 indicates that the Model 1 has good discriminative ability. In other words, it is capable of distinguishing between cases and controls with a high degree of accuracy.
This model shows a slight improvement in AUC over the Model 1. The increase suggests that the additional variables (or adjustments) I made in the Model 2 contribute positively to its ability to differentiate between cases and controls. The difference in AUC between the Model 1 and Model 2, while modest, is still meaningful, especially in practical, real-world contexts.
The Model 3 shows a very slight improvement in AUC over the Model 2. This indicates that adding interaction terms provides a marginal improvement in the model’s discriminatory power. However, the improvement is very minimal, which aligns with my earlier findings that the interaction terms did not significantly improve the model fit.
All models demonstrate good ability to distinguish between cases and controls. An AUC greater than 0.7 is generally considered acceptable, and my models are above 0.7 and close to 0.8. The Model 2 and Model 3 only show marginal improvements in AUC compared to the Model 1. This suggests that while the additional complexity (more variables and interaction terms) does contribute slightly to model performance, the gains are not substantial.
# No imputation -- Remove rows with any missing data from 'female_df'
og_df <- na.omit(married_df)
print(og_df)
## # A tibble: 7,591 × 11
## age_first_marriage marital_status area region education_level
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 23 1 1 1 5
## 2 17 1 1 1 2
## 3 23 1 1 1 5
## 4 22 1 1 1 5
## 5 26 1 1 1 5
## 6 23 1 1 1 5
## 7 27 1 1 1 4
## 8 25 1 1 1 5
## 9 21 1 1 1 3
## 10 25 1 1 1 5
## # ℹ 7,581 more rows
## # ℹ 6 more variables: health_insurance <dbl>, ethnicity <dbl>,
## # wealth_index <dbl>, current_contraceptive_use <dbl>,
## # awareness_hiv_aids <dbl>, access_to_media <dbl>
# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
og_df <- og_df %>%
mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
og_df <- og_df %>%
mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
og_df <- og_df %>%
dplyr::select(child_marriage, child_marriage_u16, everything())
# Convert nominal and ordinal variables to factors
og_df$area <- as.factor(og_df$area)
og_df$region <- as.factor(og_df$region)
og_df$education_level <- factor(og_df$education_level, ordered = FALSE)
og_df$ethnicity <- as.factor(og_df$ethnicity)
og_df$wealth_index <- factor(og_df$wealth_index, ordered = FALSE)
# Binary variables are already in the correct format and can be used as is
#write.csv(og_df, "og_df.csv", row.names = FALSE)
# Base model
base_model_no_impute <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)
# Summarize the base model
summary(base_model_no_impute)
##
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index +
## health_insurance + current_contraceptive_use + awareness_hiv_aids +
## access_to_media, family = binomial(), data = og_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.12285 0.15692 -0.783 0.43370
## area2 0.26196 0.09295 2.818 0.00483 **
## education_level1 -0.43948 0.10272 -4.278 1.88e-05 ***
## education_level2 -0.51284 0.10331 -4.964 6.90e-07 ***
## education_level3 -1.27682 0.13682 -9.332 < 2e-16 ***
## education_level4 -3.38049 0.59104 -5.720 1.07e-08 ***
## education_level5 -3.10583 0.27962 -11.107 < 2e-16 ***
## wealth_index2 -0.61689 0.09276 -6.651 2.92e-11 ***
## wealth_index3 -0.95525 0.11163 -8.558 < 2e-16 ***
## wealth_index4 -0.75531 0.12062 -6.262 3.80e-10 ***
## wealth_index5 -1.01614 0.15960 -6.367 1.93e-10 ***
## health_insurance 0.09667 0.09195 1.051 0.29308
## current_contraceptive_use -0.08254 0.06980 -1.182 0.23704
## awareness_hiv_aids -0.47372 0.08239 -5.750 8.94e-09 ***
## access_to_media 0.01077 0.10367 0.104 0.91726
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7404.1 on 7590 degrees of freedom
## Residual deviance: 6086.8 on 7576 degrees of freedom
## AIC: 6116.8
##
## Number of Fisher Scoring iterations: 7
# Enhanced model with additional fixed effects
enhanced_model_no_impute <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)
# Summarize the new model with FEs
summary(enhanced_model_no_impute)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media, family = binomial(),
## data = og_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.6510890 0.2290549 -7.208 5.67e-13 ***
## area2 0.1470706 0.0980382 1.500 0.13358
## region2 0.0495982 0.1566174 0.317 0.75148
## region3 -0.0227665 0.1516838 -0.150 0.88069
## region4 -0.1639096 0.1668163 -0.983 0.32582
## region5 -0.0308200 0.1398462 -0.220 0.82557
## region6 0.0503335 0.1461564 0.344 0.73056
## education_level1 0.1358036 0.1169557 1.161 0.24558
## education_level2 0.0826556 0.1189010 0.695 0.48695
## education_level3 -0.7178808 0.1495591 -4.800 1.59e-06 ***
## education_level4 -2.8361281 0.5948134 -4.768 1.86e-06 ***
## education_level5 -2.5784165 0.2856381 -9.027 < 2e-16 ***
## ethnicity2 0.5559934 0.1376108 4.040 5.34e-05 ***
## ethnicity3 0.3371900 0.1396231 2.415 0.01574 *
## ethnicity4 1.9238662 0.1608494 11.961 < 2e-16 ***
## wealth_index2 -0.0532348 0.1077890 -0.494 0.62139
## wealth_index3 -0.3666087 0.1293455 -2.834 0.00459 **
## wealth_index4 -0.1573256 0.1414044 -1.113 0.26588
## wealth_index5 -0.4268584 0.1831853 -2.330 0.01980 *
## health_insurance -0.0958692 0.0954439 -1.004 0.31516
## current_contraceptive_use 0.0008332 0.0728182 0.011 0.99087
## awareness_hiv_aids -0.0914918 0.0944656 -0.969 0.33278
## access_to_media 0.1818904 0.1103596 1.648 0.09932 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7404.1 on 7590 degrees of freedom
## Residual deviance: 5849.7 on 7568 degrees of freedom
## AIC: 5895.7
##
## Number of Fisher Scoring iterations: 7
# Adding the interaction term between area and education level
interaction_model_no_impute <- update(enhanced_model_no_impute, . ~ . + area:education_level)
summary(interaction_model_no_impute)
##
## Call:
## glm(formula = child_marriage ~ area + region + education_level +
## ethnicity + wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:education_level,
## family = binomial(), data = og_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.44930 0.39341 -3.684 0.00023 ***
## area2 -0.07873 0.36089 -0.218 0.82730
## region2 0.04910 0.15677 0.313 0.75412
## region3 -0.02527 0.15186 -0.166 0.86783
## region4 -0.15765 0.16694 -0.944 0.34499
## region5 -0.01975 0.14022 -0.141 0.88798
## region6 0.05481 0.14624 0.375 0.70781
## education_level1 0.13163 0.37724 0.349 0.72714
## education_level2 -0.19969 0.36500 -0.547 0.58431
## education_level3 -1.04485 0.39533 -2.643 0.00822 **
## education_level4 -3.26882 1.06401 -3.072 0.00213 **
## education_level5 -2.69970 0.49148 -5.493 3.95e-08 ***
## ethnicity2 0.55976 0.13773 4.064 4.82e-05 ***
## ethnicity3 0.32597 0.13973 2.333 0.01965 *
## ethnicity4 1.94289 0.16204 11.990 < 2e-16 ***
## wealth_index2 -0.05987 0.10795 -0.555 0.57918
## wealth_index3 -0.36871 0.12941 -2.849 0.00438 **
## wealth_index4 -0.15023 0.14140 -1.062 0.28805
## wealth_index5 -0.41743 0.18406 -2.268 0.02334 *
## health_insurance -0.09355 0.09555 -0.979 0.32754
## current_contraceptive_use 0.00196 0.07286 0.027 0.97854
## awareness_hiv_aids -0.09512 0.09437 -1.008 0.31351
## access_to_media 0.17945 0.11022 1.628 0.10349
## area2:education_level1 -0.01759 0.39520 -0.045 0.96450
## area2:education_level2 0.32521 0.38034 0.855 0.39252
## area2:education_level3 0.39737 0.42012 0.946 0.34423
## area2:education_level4 0.59025 1.28428 0.460 0.64581
## area2:education_level5 0.03787 0.62518 0.061 0.95170
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7404.1 on 7590 degrees of freedom
## Residual deviance: 5846.0 on 7563 degrees of freedom
## AIC: 5902
##
## Number of Fisher Scoring iterations: 7
# Extracting data from the models
extract_model_data <- function(model) {
model_summary <- summary(model)
coeffs <- model_summary$coefficients
data.frame(
Term = rownames(coeffs),
Estimate = sprintf("%.3f", coeffs[, "Estimate"]),
pValue = ifelse(coeffs[, "Pr(>|z|)"] < 0.001,
format(coeffs[, "Pr(>|z|)"], scientific = TRUE),
sprintf("%.3f", coeffs[, "Pr(>|z|)"])),
Significance = sapply(coeffs[, "Pr(>|z|)"], add_asterisks)
)
}
# Applying the function to each model
base_impute_data <- extract_model_data(baseline_model)
base_no_impute_data <- extract_model_data(base_model_no_impute)
# Combine the data for comparison
base_comparison_data <- merge(base_impute_data, base_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)
# View the comparison
print(base_comparison_data)
## Term Estimate_Impute pValue_Impute Significance_Impute
## 1 (Intercept) -0.351 0.009 **
## 2 area2 0.408 5.222876e-07 ***
## 3 education_level1 -0.365 3.696088e-05 ***
## 4 education_level2 -0.434 6.270603e-07 ***
## 5 education_level3 -1.116 1.076550e-21 ***
## 6 education_level4 -3.144 8.474392e-10 ***
## 7 education_level5 -2.929 1.060085e-30 ***
## 8 wealth_index2 -0.550 1.230578e-11 ***
## 9 wealth_index3 -0.934 1.867146e-20 ***
## 10 wealth_index4 -0.761 5.702313e-12 ***
## 11 wealth_index5 -1.021 1.156489e-11 ***
## 12 health_insurance 0.117 0.154
## 13 current_contraceptive_use -0.033 0.603
## 14 awareness_hiv_aids -0.453 1.412117e-10 ***
## 15 access_to_media -0.027 0.741
## Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1 -0.123 0.434
## 2 0.262 0.005 **
## 3 -0.439 1.881675e-05 ***
## 4 -0.513 6.901996e-07 ***
## 5 -1.277 1.036155e-20 ***
## 6 -3.380 1.067977e-08 ***
## 7 -3.106 1.155012e-28 ***
## 8 -0.617 2.917388e-11 ***
## 9 -0.955 1.153337e-17 ***
## 10 -0.755 3.797438e-10 ***
## 11 -1.016 1.931436e-10 ***
## 12 0.097 0.293
## 13 -0.083 0.237
## 14 -0.474 8.940563e-09 ***
## 15 0.011 0.917
# Applying the function to each model
enhanced_impute_data <- extract_model_data(enhanced_model)
enhanced_no_impute_data <- extract_model_data(enhanced_model_no_impute)
# Combine the data for comparison
enhanced_comparison_data <- merge(enhanced_impute_data, enhanced_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)
# View the comparison
print(enhanced_comparison_data)
## Term Estimate_Impute pValue_Impute Significance_Impute
## 1 (Intercept) -1.194 6.082446e-11 ***
## 2 area2 0.268 0.002 **
## 3 region2 0.364 0.005 **
## 4 region3 0.162 0.209
## 5 region4 0.331 0.010 **
## 6 region5 -0.009 0.942
## 7 region6 -0.042 0.757
## 8 education_level1 0.023 0.811
## 9 education_level2 -0.032 0.742
## 10 education_level3 -0.740 2.491054e-09 ***
## 11 education_level4 -2.816 4.474974e-08 ***
## 12 education_level5 -2.598 6.330343e-24 ***
## 13 ethnicity2 -0.004 0.968
## 14 ethnicity3 0.284 0.029 *
## 15 ethnicity4 1.200 4.023378e-28 ***
## 16 wealth_index2 -0.231 0.008 **
## 17 wealth_index3 -0.612 1.033996e-08 ***
## 18 wealth_index4 -0.433 2.596704e-04 ***
## 19 wealth_index5 -0.685 2.334946e-05 ***
## 20 health_insurance 0.003 0.973
## 21 current_contraceptive_use 0.032 0.623
## 22 awareness_hiv_aids -0.185 0.017 *
## 23 access_to_media -0.081 0.353
## Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1 -1.651 5.666765e-13 ***
## 2 0.147 0.134
## 3 0.050 0.751
## 4 -0.023 0.881
## 5 -0.164 0.326
## 6 -0.031 0.826
## 7 0.050 0.731
## 8 0.136 0.246
## 9 0.083 0.487
## 10 -0.718 1.586814e-06 ***
## 11 -2.836 1.859743e-06 ***
## 12 -2.578 1.766623e-19 ***
## 13 0.556 5.337557e-05 ***
## 14 0.337 0.016 *
## 15 1.924 5.710335e-33 ***
## 16 -0.053 0.621
## 17 -0.367 0.005 **
## 18 -0.157 0.266
## 19 -0.427 0.020 *
## 20 -0.096 0.315
## 21 0.001 0.991
## 22 -0.091 0.333
## 23 0.182 0.099
# Applying the function to each model
interaction_impute_data <- extract_model_data(model_interaction)
interaction_no_impute_data <- extract_model_data(interaction_model_no_impute)
# Combine the data for comparison
interaction_comparison_data <- merge(interaction_impute_data, interaction_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)
# View the comparison
print(interaction_comparison_data)
## Term Estimate_Impute pValue_Impute Significance_Impute
## 1 (Intercept) -0.805 0.008 **
## 2 area2 -0.149 0.589
## 3 region2 0.353 0.007 **
## 4 region3 0.156 0.228
## 5 region4 0.331 0.010 **
## 6 region5 0.003 0.980
## 7 region6 -0.042 0.754
## 8 education_level1 -0.154 0.612
## 9 education_level2 -0.519 0.067
## 10 education_level3 -1.184 2.034807e-04 ***
## 11 education_level4 -3.614 5.074377e-04 ***
## 12 education_level5 -2.979 8.897857e-12 ***
## 13 ethnicity2 -0.003 0.977
## 14 ethnicity3 0.268 0.039 *
## 15 ethnicity4 1.222 1.268874e-28 ***
## 16 wealth_index2 -0.244 0.005 **
## 17 wealth_index3 -0.621 7.055082e-09 ***
## 18 wealth_index4 -0.432 2.777919e-04 ***
## 19 wealth_index5 -0.680 3.219988e-05 ***
## 20 health_insurance 0.009 0.912
## 21 current_contraceptive_use 0.035 0.588
## 22 awareness_hiv_aids -0.192 0.013 *
## 23 access_to_media -0.085 0.329
## 24 area2:education_level1 0.176 0.580
## 25 area2:education_level2 0.547 0.064
## 26 area2:education_level3 0.500 0.140
## 27 area2:education_level4 1.024 0.391
## 28 area2:education_level5 0.410 0.449
## Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1 -1.449 2.296503e-04 ***
## 2 -0.079 0.827
## 3 0.049 0.754
## 4 -0.025 0.868
## 5 -0.158 0.345
## 6 -0.020 0.888
## 7 0.055 0.708
## 8 0.132 0.727
## 9 -0.200 0.584
## 10 -1.045 0.008 **
## 11 -3.269 0.002 **
## 12 -2.700 3.951423e-08 ***
## 13 0.560 4.817267e-05 ***
## 14 0.326 0.020 *
## 15 1.943 4.015586e-33 ***
## 16 -0.060 0.579
## 17 -0.369 0.004 **
## 18 -0.150 0.288
## 19 -0.417 0.023 *
## 20 -0.094 0.328
## 21 0.002 0.979
## 22 -0.095 0.314
## 23 0.179 0.103
## 24 -0.018 0.965
## 25 0.325 0.393
## 26 0.397 0.344
## 27 0.590 0.646
## 28 0.038 0.952
# Tidy the baseline model with confidence intervals
tidy_base_no_impute <- tidy(base_model_no_impute, conf.int = TRUE, exponentiate = TRUE)
# Tidy the enhanced model with confidence intervals
tidy_enhanced_no_impute <- tidy(enhanced_model_no_impute, conf.int = TRUE, exponentiate = TRUE)
# Tidy the interaction model with confidence intervals
tidy_interaction_no_impute <- tidy(interaction_model_no_impute, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_base_no_impute$asterisks <- sapply(tidy_base_no_impute$p.value, add_asterisks)
tidy_enhanced_no_impute$asterisks <- sapply(tidy_enhanced_no_impute$p.value, add_asterisks)
tidy_interaction_no_impute$asterisks <- sapply(tidy_interaction_no_impute$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_base_no_impute <- tidy_base_no_impute %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_enhanced_no_impute <- tidy_enhanced_no_impute %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
tidy_interaction_no_impute <- tidy_interaction_no_impute %>%
mutate(
OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
)
# Add a 'Model' column to each tidied dataframe
tidy_base_no_impute <- tidy_base_no_impute %>% mutate(Model = "Model_1")
tidy_enhanced_no_impute <- tidy_enhanced_no_impute %>% mutate(Model = "Model_2")
tidy_interaction_no_impute <- tidy_interaction_no_impute %>% mutate(Model = "Model_3")
# Combine and pivot the dataframes
combined_no_impute <- bind_rows(
tidy_base_no_impute %>% dplyr::select(term, OR, CI, Model),
tidy_enhanced_no_impute %>% dplyr::select(term, OR, CI, Model),
tidy_interaction_no_impute %>% dplyr::select(term, OR, CI, Model)
) %>%
pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_no_impute[is.na(combined_no_impute)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_no_impute <- combined_no_impute %>%
dplyr::select(term,
OR_Model_1, CI_Model_1,
OR_Model_2, CI_Model_2,
OR_Model_3, CI_Model_3)
# Print the final combined table
print(combined_no_impute)
## # A tibble: 28 × 7
## term OR_Model_1 CI_Model_1 OR_Model_2 CI_Model_2 OR_Model_3 CI_Model_3
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 (Intercept) 0.88 (0.65, 1.… 0.19*** (0.12, 0.… 0.23*** (0.1, 0.4…
## 2 area2 1.3** (1.08, 1.… 1.16 (0.96, 1.… 0.92 (0.47, 1.…
## 3 education_… 0.64*** (0.53, 0.… 1.15 (0.91, 1.… 1.14 (0.56, 2.…
## 4 education_… 0.6*** (0.49, 0.… 1.09 (0.86, 1.… 0.82 (0.41, 1.…
## 5 education_… 0.28*** (0.21, 0.… 0.49*** (0.36, 0.… 0.35** (0.17, 0.…
## 6 education_… 0.03*** (0.01, 0.… 0.06*** (0.01, 0.… 0.04** (0, 0.21)
## 7 education_… 0.04*** (0.02, 0.… 0.08*** (0.04, 0.… 0.07*** (0.03, 0.…
## 8 wealth_ind… 0.54*** (0.45, 0.… 0.95 (0.77, 1.… 0.94 (0.76, 1.…
## 9 wealth_ind… 0.38*** (0.31, 0.… 0.69** (0.54, 0.… 0.69** (0.54, 0.…
## 10 wealth_ind… 0.47*** (0.37, 0.… 0.85 (0.65, 1.… 0.86 (0.65, 1.…
## # ℹ 18 more rows
(Intercept) Odds Ratios and Significance:Area2:Education Level Categories:Wealth Index Categories:Health Insurance:Current Contraceptive Use:Awareness of HIV/AIDS:Access to Media:This variable shows no consistent or significant association with the outcome across the three models.
Region Variables in Models 2 and 3:Ethnicity Categories in Models 2 and 3:ethnicity4, which has very high ORs and is
highly significant.ethnicity:access_to_media in Model
3:# VIFs check (A VIF value > 5 indicates high multicollinearity)
# Base model
model_1_vif <- vif(base_model_no_impute)
print(model_1_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 1.178945 1 1.085793
## education_level 1.745961 5 1.057313
## wealth_index 1.613551 4 1.061629
## health_insurance 1.029280 1 1.014534
## current_contraceptive_use 1.009914 1 1.004945
## awareness_hiv_aids 1.483532 1 1.218003
## access_to_media 1.216401 1 1.102906
# Enhanced model
model_2_vif <- vif(enhanced_model_no_impute)
print(model_2_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 1.311056 1 1.145014
## region 5.810682 5 1.192402
## education_level 2.259331 5 1.084921
## ethnicity 7.849140 3 1.409733
## wealth_index 2.856573 4 1.140199
## health_insurance 1.087155 1 1.042667
## current_contraceptive_use 1.025894 1 1.012864
## awareness_hiv_aids 1.784748 1 1.335945
## access_to_media 1.260862 1 1.122881
# Interacton model
model_3_vif <- vif(interaction_model_no_impute)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(model_3_vif)
## GVIF Df GVIF^(1/(2*Df))
## area 17.655196 1 4.201809
## region 5.841703 5 1.193037
## education_level 2372.303552 5 2.175289
## ethnicity 8.072108 3 1.416330
## wealth_index 2.921655 4 1.143414
## health_insurance 1.088687 1 1.043402
## current_contraceptive_use 1.026229 1 1.013030
## awareness_hiv_aids 1.783196 1 1.335364
## access_to_media 1.259876 1 1.122442
## area:education_level 3313.050828 5 2.249173
region and ethnicity,
with a mild increase in multicollinearity, but not at alarming
levels.area, education_level and its interaction
with one another`; their VIF values are significantly higher (around
4.2, 2.17, and 2.25, respectively). Although these values are below 5,
the increase is substantial compared to the previous models and could
start to affect the reliability and interpretability of the regression
coefficients for these variables.# Model 1 vs. Model 2
anova(base_model_no_impute, enhanced_model_no_impute, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance +
## current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7576 6086.8
## 2 7568 5849.7 8 237.05 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Model 2 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).
# Model 1 vs. Model 3
anova(base_model_no_impute, interaction_model_no_impute, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance +
## current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:education_level
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7576 6086.8
## 2 7563 5846.0 13 240.78 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The Model 3 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).
# Model 2 vs. Model 3
anova(enhanced_model_no_impute, interaction_model_no_impute, test="Chisq")
## Analysis of Deviance Table
##
## Model 1: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity +
## wealth_index + health_insurance + current_contraceptive_use +
## awareness_hiv_aids + access_to_media + area:education_level
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 7568 5849.7
## 2 7563 5846.0 5 3.7254 0.5896
The residual deviance in Model 3 is slightly lower compared to Model 1, indicating a potential improvement in model fit but not significant.
The p-value of 0.5896 is not below the common significance level.
This indicates that the addition of the interaction term
area:education_level in Model 3 does not significantly
improve the model fit compared to Model 2. In simpler terms, the
interaction between area and education_level
does not seem to have a significant effect on predicting child
marriage.
# Create a data frame to hold AIC and BIC values
aic_bic_comparison <- data.frame(
Model = c("Model 1", "Model 2", "Model 3"),
AIC = c(AIC(base_model_no_impute), AIC(enhanced_model_no_impute), AIC(interaction_model_no_impute)),
BIC = c(BIC(base_model_no_impute), BIC(enhanced_model_no_impute), BIC(interaction_model_no_impute))
)
# Print the table
print(aic_bic_comparison)
## Model AIC BIC
## 1 Model 1 6116.798 6220.819
## 2 Model 2 5895.745 6055.243
## 3 Model 3 5902.019 6096.191
Model 2 has the lowest AIC and BIC values of all, indicating that it might be the best model among the three in terms of balancing fit and complexity.
# 1. Hosmer-Lemeshow Test for the Model 1
hoslem.test(base_model_no_impute$y, fitted(base_model_no_impute), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: base_model_no_impute$y, fitted(base_model_no_impute)
## X-squared = 15.295, df = 8, p-value = 0.05366
# 2. Hosmer-Lemeshow Test for the Model 2 (Baseline + Fixed Effects)
hoslem.test(enhanced_model_no_impute$y, fitted(enhanced_model_no_impute), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: enhanced_model_no_impute$y, fitted(enhanced_model_no_impute)
## X-squared = 10.076, df = 8, p-value = 0.2597
# 3. Hosmer-Lemeshow Test for the Model 3 (Baseline + Fixed Effects + Interaction Terms)
hoslem.test(interaction_model_no_impute$y, fitted(interaction_model_no_impute), g=10)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: interaction_model_no_impute$y, fitted(interaction_model_no_impute)
## X-squared = 6.9744, df = 8, p-value = 0.5394
A large p-value (>0.05) indicates a good fit, meaning that there’s no significant difference between the observed and predicted values. Through each model, the p-value increases which suggests that our decision to include fixed effects and interaction terms are significant.
binnedplot(fitted(interaction_model_no_impute),
residuals(interaction_model_no_impute, type = "response"),
nclass = NULL,
xlab = "Expected Values",
ylab = "Average Residuals",
main = "Binned Residual Plot",
cex.pts = 1,
col.int = "gray")
invisible(plot(roc(og_df$child_marriage,
fitted(base_model_no_impute)),
col = "red",
main = "ROC Curve: \nModel I (Red) vs. Model II (Green) vs. Model III (Blue)"))
invisible(plot(roc(og_df$child_marriage,
fitted(enhanced_model_no_impute)),
col = "green",
add = T))
invisible(plot(roc(og_df$child_marriage,
fitted(interaction_model_no_impute)),
print.auc = T,
col = "blue",
add = T))
# For each model
roc_model_1 <- roc(og_df$child_marriage, fitted(base_model_no_impute))
auc_mod_1 <- auc(roc_model_1)
roc_model_2 <- roc(og_df$child_marriage, fitted(enhanced_model_no_impute))
auc_mod_2 <- auc(roc_model_2)
roc_model_3 <- roc(og_df$child_marriage, fitted(interaction_model_no_impute))
auc_mod_3 <- auc(roc_model_3)
# Compare AUC values
print(paste("AUC Model 1:", auc_mod_1))
## [1] "AUC Model 1: 0.78337409946712"
print(paste("AUC Model 2:", auc_mod_2))
## [1] "AUC Model 2: 0.79976685814396"
print(paste("AUC Model 3:", auc_mod_3))
## [1] "AUC Model 3: 0.800031108041481"
An AUC of 0.5 represents a model with no discriminative ability (akin to random guessing), while an AUC of 1.0 represents perfect discrimination. So, my model is performing substantially better than random guessing.
The AUC of 0.7 indicates that the Model 1 has good discriminative ability. In other words, it is capable of distinguishing between cases and controls with a high degree of accuracy.
This model shows a slight improvement in AUC over the Model 1. The increase suggests that the additional variables (or adjustments) I made in the Model 2 contribute positively to its ability to differentiate between cases and controls. The difference in AUC between the Model 1 and Model 2, while modest, is still meaningful, especially in practical, real-world contexts.
The Model 3 shows a very slight improvement in AUC over the Model 2. This indicates that adding interaction terms provides a marginal improvement in the model’s discriminatory power. However, the improvement is very minimal, which aligns with my earlier findings that the interaction terms did not significantly improve the model fit.
All models demonstrate good ability to distinguish between cases and controls. An AUC greater than 0.7 is generally considered acceptable, and my models are above 0.7 and model 3 equals to 0.8. The Model 2 and Model 3 only show marginal improvements in AUC compared to the Model 1. This suggests that while the additional complexity (more variables and interaction terms) does contribute slightly to model performance, the gains are not substantial.